Single-cell RNA-Seq analysis pipeline (GEO format)

This is a pipeline to analyze single-cell RNA Seq data from GEO. In this script the single cell RNA-Seq results from this paper is regenerated: https://pubmed.ncbi.nlm.nih.gov/34956864/ Title: Bulk and Single-Cell Profiling of Breast Tumors Identifies TREM-1 as a Dominant Immune Suppressive Marker Associated With Poor Outcomes

Initial library loading

library(Seurat)
library(tidyverse)
library(GEOquery)

0. Fetch data from GEO

No need to perform the following two lines next time, as the folders are already created.

file <- getGEOSuppFiles("GSE188600")
untar("GSE188600/GSE188600_RAW.tar", exdir = 'data/')

1. Create the counts matrix

find the files in data folder (i.e., barcodes, features, matrix)

wd = getwd()
files = list.files(path = paste0(wd, '/data'), full.names = FALSE, recursive = FALSE)

mtx.cnts <- ReadMtx(mtx = paste0('data/', files[3]),
                features = paste0('data/', files[2]),
                cells = paste0('data/', files[1]))

2. create a seurat object

Seurat.obj <- CreateSeuratObject(counts = mtx.cnts)

Seurat.obj
## An object of class Seurat 
## 33694 features across 770 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)

3. QC and filtering

view Seurat object meta data

view(Seurat.obj@meta.data)

3.1 calculate mitochondrial percentage

high percentage shows bad quality

Seurat.obj$mito.prct <- PercentageFeatureSet(Seurat.obj, pattern = '^MT-')

3.2 explore QC

plot feature and RNA counts and mitochondial percentage

VlnPlot(Seurat.obj, features = c("nFeature_RNA", "nCount_RNA", "mito.prct"), ncol = 3)

FeatureScatter(Seurat.obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") +
  geom_smooth(method = 'lm')

3.3 filtering cells

more than 800 RNA count and more than 500 genes and less than 10 mitochondrial%

Seurat.obj.filt <- subset(Seurat.obj, subset = nCount_RNA >800 &
                          nFeature_RNA > 500 &
                            mito.prct < 10)

4. Finding Variable genes

note: This data set contains one sample, if more than 1 sample was used correction for potential batch effects should be considered

Seurat.obj.filt <- NormalizeData(object = Seurat.obj.filt)

variable genes - with visualization of top 10 variable genes

Seurat.obj.filt <- FindVariableFeatures(object = Seurat.obj.filt, selection.method = 'vst', nfeatures = 2000)

top10 <- head(VariableFeatures(Seurat.obj.filt), 10)

plot1 <- VariableFeaturePlot(Seurat.obj.filt)
LabelPoints(plot = plot1, points = top10, repel = TRUE)

5. Clustering the cells

scaling the data

Seurat.obj.filt <- ScaleData(object = Seurat.obj.filt)

perform linear dimensionality reduction

Seurat.obj.filt <- RunPCA(object = Seurat.obj.filt, features = VariableFeatures((Seurat.obj.filt)))

selecting the PCA plots with elbow plot

ElbowPlot(Seurat.obj.filt, ndims = 35)

Find Neighbors

Seurat.obj.filt <- FindNeighbors(object = Seurat.obj.filt, dim = 1:20)

understanding the resolution

Seurat.obj.filt <- RunUMAP(object = Seurat.obj.filt, dim = 1:20)
Seurat.obj.filt <- FindClusters(object = Seurat.obj.filt, resolution = c(0.01, 0.1, 0.3, 0.5, 0.8, 1, 1.2))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9932
## Number of communities: 3
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9618
## Number of communities: 4
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9111
## Number of communities: 7
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8790
## Number of communities: 8
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8422
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8190
## Number of communities: 11
## Elapsed time: 0 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 736
## Number of edges: 21217
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7959
## Number of communities: 11
## Elapsed time: 0 seconds

This resolution should be changed until the correct number of clusters are achieved

DimPlot(Seurat.obj.filt, group.by = 'RNA_snn_res.0.5', label = TRUE)

setting identity of clusters

Seurat.obj.filt <- RunUMAP(object = Seurat.obj.filt, dim = 1:20)
Seurat.obj.filt <- RunTSNE(object = Seurat.obj.filt, dims = 1:20)
Idents(Seurat.obj.filt) <- 'RNA_snn_res.0.5'

umap observation

DimPlot(Seurat.obj.filt, reduction = 'umap', label = TRUE)

tsne observation

DimPlot(Seurat.obj.filt, reduction = 'tsne' , label = TRUE)

Find optimal number of clusters

DefaultAssay(Seurat.obj.filt) # make sure it is RNA
## [1] "RNA"

find markers for every cluster compared to all remaining cells, report only the positive ones

Seurat.obj.filt.markers <- FindAllMarkers(Seurat.obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

select top (4 -> can be changed) upregulated genes in each cluster

clust.markers <- Seurat.obj.filt.markers %>%
  group_by(cluster) %>%
  slice_max(n = 4, order_by = avg_log2FC)

6. Steps to complete the annotation

Visualize top upregulated genes in each cluster

DoHeatmap(Seurat.obj.filt, features = clust.markers$gene, size = 4,
          angle = 90)

Ridge plots - from ggridges.

Visualize single cell expression distributions in each cluster

RidgePlot(Seurat.obj.filt, features = clust.markers$gene, ncol = 6)

Violin plot

Visualize single cell expression distributions in each cluster

VlnPlot(Seurat.obj.filt, features = clust.markers$gene, ncol = 4)

Feature plot

visualize feature expression in low-dimensional space

FeaturePlot(Seurat.obj.filt, features = clust.markers$gene, ncol = 4)

Check individual feature

FeaturePlot(Seurat.obj.filt, features = c('CD163'), min.cutoff = 'q10')

feature plots (feature identification based on up-reg genes and pangloadb)

features <- c("CST3", "CD86", "SPP1", "C1QA", "CTHRC1", "MGP", "CXCL13", "TNFRSF18", "JCHAIN", "IGKC",
              "NKG7", "GNLY", "IL7R", "RPL34", "STMN1", "KIAA0101")

assigning the new clusters to data

new.cluster.ids <- c("Mature DC", "DC", "Fibroblast", "T cell", "Immature DC",
                     "NK cells", "Naive T cell", "TNBC")

view new plot with cluster names

names(new.cluster.ids) <- levels(Seurat.obj.filt)
Seurat.obj.filt <- RenameIdents(Seurat.obj.filt, new.cluster.ids)
DimPlot(Seurat.obj.filt, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

DimPlot(Seurat.obj.filt, reduction = "umap")

Single cell heatmap of feature expression

DoHeatmap(subset(Seurat.obj.filt, downsample = 700), features = features, size = 3)

# Feature plot - visualize feature expression in low-dimensional space

FeaturePlot(Seurat.obj.filt, features = features, ncol = 4)

select top (8 -> can be changed) upregulated genes in each cluster

top.features <- Seurat.obj.filt.markers %>%
  group_by(cluster) %>%
  slice_max(n = 6, order_by = avg_log2FC)
DoHeatmap(subset(Seurat.obj.filt, downsample = 700), features = top.features$gene, size = 3)

Dot plots - the size of the dot corresponds to the percentage of cells expressing the

feature in each cluster. The color represents the average expression level

DotPlot(Seurat.obj.filt, features = features) + RotatedAxis()